This notebook walks through the implementation of an image-classification machine learning model that classifies 133 dog breeds, using the dog breed dataset provided by Udacity (https://s3-us-west-1.amazonaws.com/udacity-aind/dog-project/dogImages.zip)
# Install the smdebug package, needed below for SageMaker Debugger trial analysis.
!pip install smdebug
# Imports: boto3 (AWS SDK) and the SageMaker Python SDK used throughout the notebook.
import sagemaker
import boto3
from sagemaker.session import Session
from sagemaker import get_execution_role
# Initializing some useful variables: the notebook's execution role, a SageMaker
# session, its region, and the session's default S3 bucket (used later for the
# dataset upload and training-job output).
role = get_execution_role()
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
bucket = sagemaker_session.default_bucket()
print(f"Region {region}")
print(f"Default s3 bucket : {bucket}")
The dataset we used for this project is the dogImages dataset that can be found in this link. It contains images of 133 dog breeds split into train, valid and test folders, each containing a sample of every breed. An example from the test folder is ./dogImages/test/018.Beauceron/Beauceron_01284.jpg
# Fetch the dataset zip and upload the extracted images to S3.
!wget https://s3-us-west-1.amazonaws.com/udacity-aind/dog-project/dogImages.zip
!unzip dogImages.zip > /dev/null
# Key prefix under which the images are stored in the default bucket.
prefix ="dogImagesDataset"
print("Starting to upload dogImages")
inputs = sagemaker_session.upload_data(path="dogImages", bucket=bucket, key_prefix=prefix)
print(f"Input path ( S3 file path ): {inputs}")
# NOTE(review): `inputs` is immediately overwritten with a hard-coded S3 URI,
# presumably so re-runs can skip the upload above — confirm this bucket/prefix
# matches the account and region this notebook actually runs in.
inputs = 's3://sagemaker-us-east-1-881607171913/dogImagesDataset'
print(f"Input path ( S3 file path ): {inputs}")
The ResNet50 model with two fully connected linear NN layers is used for this image classification problem. ResNet-50 is 50 layers deep and is trained on a million images of 1000 categories from the ImageNet database. Furthermore, the model has a lot of trainable parameters, which indicates a deep architecture that makes it better for image recognition. The optimizer that we will be using for this model is AdamW ( For more info refer : https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html ). Hence, the hyperparameters selected for tuning were: Learning rate - default(x) is 0.001, so we have selected a 0.01x to 100x range for the learning rate; eps - default is 1e-08, which is acceptable in most cases, so we have selected a range of 1e-09 to 1e-08; Weight decay - default(x) is 0.01, so we have selected a 0.1x to 10x range for the weight decay; Batch size - selected only two values [ 64, 128 ]
# Hyperparameter-tuning setup.
# AdamW decouples the weight-decay step from the gradient update, so besides the
# learning rate and batch size we also tune its weight_decay and eps parameters.
from sagemaker.tuner import (
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)

hyperparameter_ranges = {
    "lr": ContinuousParameter(0.0001, 0.1),
    "eps": ContinuousParameter(1e-9, 1e-8),
    "weight_decay": ContinuousParameter(1e-3, 1e-1),
    "batch_size": CategoricalParameter([64, 128]),
}

# The tuner minimizes the average test loss that the training script prints;
# the regex below extracts it from the job's log output.
objective_metric_name = "average test loss"
objective_type = "Minimize"
metric_definitions = [
    {
        "Name": "average test loss",
        "Regex": "Test set: Average loss: ([0-9\\.]+)",
    }
]
# Base PyTorch estimator for the tuning jobs; hpo.py is the tuning entry point.
from sagemaker.pytorch import PyTorch

estimator = PyTorch(
    entry_point="hpo.py",
    base_job_name="dog-breed-classification-hpo",
    role=role,
    framework_version="1.8",
    py_version="py36",
    instance_type="ml.g4dn.xlarge",
    instance_count=1,
)
# Hyperparameter tuner: up to 4 training jobs run one at a time over the ranges
# above, minimizing the "average test loss" metric; early_stopping_type="Auto"
# lets SageMaker stop unpromising jobs early.
tuner = HyperparameterTuner(
estimator,
objective_metric_name,
hyperparameter_ranges,
metric_definitions,
max_jobs=4,
max_parallel_jobs=1,
objective_type=objective_type,
early_stopping_type="Auto"
)
# Launch the tuning run; "training" is the input channel consumed by hpo.py.
tuner.fit({"training": inputs }, wait=True)
# Retrieve the estimator of the tuner's best training job and extract its
# hyperparameters for the final training run.
best_estimator = tuner.best_estimator()

# Fetch the hyperparameter dict once instead of re-querying the training-job
# description for every key.
best_job_hps = best_estimator.hyperparameters()
print(best_job_hps)

best_hyperparameters = {
    # Categorical values come back JSON-quoted (e.g. '"128"'); strip the quotes.
    'batch_size': int(best_job_hps['batch_size'].replace('"', "")),
    'eps': best_job_hps['eps'],
    'lr': best_job_hps['lr'],
    'weight_decay': best_job_hps['weight_decay'],
}
print(f"Best Hyperparameters post Hyperparameter fine tuning are : \n {best_hyperparameters}")
# Debugger and profiler configuration for the final training job.
from sagemaker.debugger import (
    CollectionConfig,
    DebuggerHookConfig,
    FrameworkProfile,
    ProfilerConfig,
    ProfilerRule,
    Rule,
    rule_configs,
)

# Built-in training-health rules plus the auto-generated profiler report.
rules = [
    Rule.sagemaker(rule_configs.vanishing_gradient()),
    Rule.sagemaker(rule_configs.overfit()),
    Rule.sagemaker(rule_configs.overtraining()),
    Rule.sagemaker(rule_configs.poor_weight_initialization()),
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
]

# Sample system metrics every 500 ms and profile 10 framework steps.
profiler_config = ProfilerConfig(
    system_monitor_interval_millis=500,
    framework_profile_params=FrameworkProfile(num_steps=10),
)

# Capture the cross-entropy loss every 10 training steps and every eval step.
collection_configs = [
    CollectionConfig(
        name="CrossEntropyLoss_output_0",
        parameters={
            "include_regex": "CrossEntropyLoss_output_0",
            "train.save_interval": "10",
            "eval.save_interval": "1",
        },
    )
]
debugger_config = DebuggerHookConfig(collection_configs=collection_configs)
# Create and fit the final estimator (train_model.py) using the best
# hyperparameters from the tuning run and the debugger/profiler hooks above.
estimator = PyTorch(
entry_point="train_model.py",
instance_count=1,
instance_type="ml.g4dn.xlarge",
role=role,
framework_version="1.6", #using 1.6 as it has support for smdebug lib , https://github.com/awslabs/sagemaker-debugger#debugger-supported-frameworks
py_version="py36",
hyperparameters=best_hyperparameters,
profiler_config=profiler_config, # include the profiler hook
debugger_hook_config=debugger_config, # include the debugger hook
rules=rules
)
# "train" is the input channel consumed by train_model.py.
estimator.fit({'train' : inputs },wait=True)
# Collect the finished job's name, the SageMaker client, and the full job
# description, to be used below when inspecting debugger/profiler output.
job_name = estimator.latest_training_job.name
client = estimator.sagemaker_session.sagemaker_client
description = client.describe_training_job(TrainingJobName=job_name)

print(f"Jobname: {job_name}")
print(f"Client: {client}")
print(f"Description: {description}")
from smdebug.trials import create_trial
from smdebug.core.modes import ModeKeys
# Create an smdebug trial over the debugger artifacts the training job wrote to S3.
trial = create_trial(estimator.latest_job_debugger_artifacts_path())
trial.tensor_names() #all the tensor names captured by the debugger hook
# Number of saved steps for the loss tensor in TRAIN and EVAL modes.
len(trial.tensor("CrossEntropyLoss_output_0").steps(mode=ModeKeys.TRAIN))
len(trial.tensor("CrossEntropyLoss_output_0").steps(mode=ModeKeys.EVAL))
#Defining some utility functions to be used for plotting tensors
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import host_subplot
#utility function to get data from tensors
def get_data(trial, tname, mode):
    """Return (steps, values) for tensor *tname* from an smdebug trial.

    trial: smdebug trial object; tname: tensor name as captured by the hook;
    mode: smdebug ModeKeys value (TRAIN or EVAL).
    Returns the list of saved step indices and the tensor value at each step.
    """
    tensor = trial.tensor(tname)
    steps = tensor.steps(mode=mode)
    # Comprehension instead of a manual append loop; order follows `steps`.
    vals = [tensor.value(s, mode=mode) for s in steps]
    return steps, vals
# Utility that plots a tensor's TRAIN series against its EVAL series on one
# figure with twin x-axes (train steps on the bottom axis, eval steps on top).
def plot_tensor(trial, tensor_name):
    """Plot TRAIN vs EVAL values of *tensor_name* from an smdebug trial."""
    train_steps, train_vals = get_data(trial, tensor_name, mode=ModeKeys.TRAIN)
    print("loaded TRAIN data")
    eval_steps, eval_vals = get_data(trial, tensor_name, mode=ModeKeys.EVAL)
    print("loaded EVAL data")

    fig = plt.figure(figsize=(10, 7))
    host = host_subplot(111)  # bottom x-axis: TRAIN steps
    par = host.twiny()        # top x-axis: EVAL steps

    host.set_xlabel("Steps (TRAIN)")
    par.set_xlabel("Steps (EVAL)")
    host.set_ylabel(tensor_name)

    (train_line,) = host.plot(train_steps, train_vals, label=tensor_name)
    print("Completed TRAIN plot")
    (eval_line,) = par.plot(eval_steps, eval_vals, label="val_" + tensor_name)
    print("Completed EVAL plot")

    # Color each axis label and legend entry to match its line.
    legend = plt.legend()
    host.xaxis.get_label().set_color(train_line.get_color())
    legend.texts[0].set_color(train_line.get_color())
    par.xaxis.get_label().set_color(eval_line.get_color())
    legend.texts[1].set_color(eval_line.get_color())

    plt.ylabel(tensor_name)
    plt.show()
#plotting the cross-entropy loss tensor captured by the debugger
plot_tensor(trial, "CrossEntropyLoss_output_0")
# Download the profiler rule output from S3.
# NOTE(review): this concatenation assumes estimator.output_path ends with "/";
# verify, otherwise the job name is fused onto the last path segment.
rule_output_path = estimator.output_path + estimator.latest_training_job.job_name + "/rule-output"
print(f"Profiler report location: {rule_output_path}")
! aws s3 ls {rule_output_path} --recursive
! aws s3 cp {rule_output_path} ./ --recursive
import os

# Find the auto-generated ProfilerReport rule folder among the job's rule
# summaries, render the HTML report inline, then zip it for submission.
profiler_rule_names = [
    summary["RuleConfigurationName"]
    for summary in estimator.latest_training_job.rule_job_summary()
    if "Profiler" in summary["RuleConfigurationName"]
]
profiler_report_name = profiler_rule_names[0]

import IPython

IPython.display.HTML(filename=profiler_report_name + "/profiler-output/profiler-report.html")

# Zip the downloaded ProfilerReport folder so it can be exported and uploaded
# later for submission.
import shutil

shutil.make_archive("./profiler_report", "zip", "ProfilerReport")
# Deploy the trained model to a real-time endpoint.
# NOTE(review): `predictor` is re-assigned by the custom-inference deploy further
# below, so this first endpoint keeps running (and billing) with no remaining
# handle to delete it — confirm whether this cell is meant to be kept.
predictor = estimator.deploy(initial_instance_count=1, instance_type="ml.m5.xlarge")
from sagemaker.pytorch import PyTorchModel
from sagemaker.predictor import Predictor
#Below is the s3 location of our saved model that was trained by the training job using the best hyperparameters
model_data_artifacts = "s3://sagemaker-us-east-1-881607171913/pytorch-training-220607-1708-003-de60ee22/output/model.tar.gz"
#Default serializer (raw JPEG bytes in) and deserializer (JSON out) used for predictions
jpeg_serializer = sagemaker.serializers.IdentitySerializer("image/jpeg")
json_deserializer = sagemaker.deserializers.JSONDeserializer()
# To override the default serializer/deserializer we subclass Predictor and hand
# the subclass to PyTorchModel via its predictor_cls parameter.
class ImgPredictor(Predictor):
    """Predictor that sends raw JPEG bytes and parses JSON responses."""

    def __init__(self, endpoint_name, sagemaker_session):
        super().__init__(
            endpoint_name,
            sagemaker_session=sagemaker_session,
            serializer=jpeg_serializer,
            deserializer=json_deserializer,
        )
# PyTorchModel wrapping the best training job's artifacts with a custom
# inference script and our ImgPredictor (JPEG in / JSON out).
pytorch_model = PyTorchModel( model_data = model_data_artifacts,
role = role,
entry_point= "endpoint_inference.py",
py_version = "py36",
framework_version = "1.6",
predictor_cls = ImgPredictor
)
predictor = pytorch_model.deploy( initial_instance_count = 1, instance_type = "ml.t2.medium") #Using ml.t2.medium to save costs
#Testing the deployed endpoint using some test images
#Solution 1: Using the Predictor object directly.
from PIL import Image
import io
import os
import numpy as np

test_dir = "./dogImages/test/129.Tibetan_mastiff/"
test_images = ["Tibetan_mastiff_08158.jpg", "Tibetan_mastiff_08139.jpg", "Tibetan_mastiff_08138.jpg"]
# NOTE(review): every image comes from breed folder 129 (Tibetan mastiff), yet
# two of the expected categories are 5 and 21 — confirm these are intentional.
test_images_expected_output = [129, 5, 21]
# Iterate images and expected labels together instead of indexing range(len(...)).
for index, (test_img, expected_breed_category) in enumerate(zip(test_images, test_images_expected_output), start=1):
    print(f"Test image no: {index}")
    test_file_path = os.path.join(test_dir, test_img)
    with open(test_file_path, "rb") as f:
        payload = f.read()
    print("Below is the image that we will be testing:")
    display(Image.open(io.BytesIO(payload)))
    print(f"Expected dog breed category no : {expected_breed_category}")
    response = predictor.predict(payload, initial_args={"ContentType": "image/jpeg"})
    print(f"Response: {response}")
    # Model output is zero-indexed over the 133 classes; +1 maps back to the
    # one-indexed breed category numbers.
    predicted_dog_breed = np.argmax(response, 1) + 1
    print(f"Response/Inference for the above image is : {predicted_dog_breed}")
    print("----------------------------------------------------------------------")
print(predictor.endpoint_name)
endpoint_name = predictor.endpoint_name
# Solution 2: Using boto3
# Using the runtime boto3 client to test the deployed model's endpoint
import os
import io
import boto3
import json
import base64
import PIL

ENDPOINT_NAME = endpoint_name
# AWS's lightweight runtime client for invoking an endpoint without the SDK.
runtime = boto3.client('runtime.sagemaker')

test_dir = "./dogImages/test/129.Tibetan_mastiff/"
test_images = ["Tibetan_mastiff_08158.jpg", "Tibetan_mastiff_08139.jpg", "Tibetan_mastiff_08138.jpg"]
# NOTE(review): every image comes from breed folder 129 (Tibetan mastiff), yet
# two of the expected categories are 5 and 21 — confirm these are intentional.
test_images_expected_output = [129, 5, 21]
# Iterate images and expected labels together instead of indexing range(len(...)).
for index, (test_img, expected_breed_category) in enumerate(zip(test_images, test_images_expected_output), start=1):
    print(f"Test image no: {index}")
    test_file_path = os.path.join(test_dir, test_img)
    with open(test_file_path, "rb") as f:
        payload = f.read()
    print("Below is the image that we will be testing:")
    display(Image.open(io.BytesIO(payload)))
    print(f"Expected dog breed category no : {expected_breed_category}")
    response = runtime.invoke_endpoint(
        EndpointName=ENDPOINT_NAME,
        ContentType='image/jpeg',
        Body=payload,
    )
    # The endpoint returns a JSON list of per-class scores in the response body.
    response_body = np.asarray(json.loads(response['Body'].read().decode('utf-8')))
    print(f"Response: {response_body}")
    # Zero-indexed model output -> one-indexed breed category.
    predicted_dog_breed = np.argmax(response_body, 1) + 1
    print(f"Response/Inference for the above image is : {predicted_dog_breed}")
# Clean up: delete the endpoint so it stops incurring charges.
# NOTE(review): this removes only the latest (custom-inference) endpoint; the
# earlier estimator.deploy() endpoint, if still running, must be removed separately.
predictor.delete_endpoint()